@@ -107,85 +107,97 @@ module Agents |
||
107 | 107 |
log "Fetching #{options['url']}" |
108 | 108 |
request_opts = { :followlocation => true } |
109 | 109 |
request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present? |
110 |
- request = Typhoeus::Request.new(options['url'], request_opts) |
|
111 | 110 |
|
112 |
- request.on_failure do |response| |
|
113 |
- error "Failed: #{response.inspect}" |
|
111 |
+ requests = [] |
|
112 |
+ |
|
113 |
+ if options['url'].kind_of?(Array) |
|
114 |
+ options['url'].each do |url| |
|
115 |
+ requests.push(Typhoeus::Request.new(url, request_opts)) |
|
116 |
+ end |
|
117 |
+ else |
|
118 |
+ requests.push(Typhoeus::Request.new(options['url'], request_opts)) |
|
114 | 119 |
end |
115 | 120 |
|
116 |
- request.on_success do |response| |
|
117 |
- body = response.body |
|
118 |
- if (encoding = options['force_encoding']).present? |
|
119 |
- body = body.encode(Encoding::UTF_8, encoding) |
|
121 |
+ requests.each do |request| |
|
122 |
+ request.on_failure do |response| |
|
123 |
+ error "Failed: #{response.inspect}" |
|
120 | 124 |
end |
121 |
- doc = parse(body) |
|
122 | 125 |
|
123 |
- if extract_full_json? |
|
124 |
- if store_payload!(previous_payloads(1), doc) |
|
125 |
- log "Storing new result for '#{name}': #{doc.inspect}" |
|
126 |
- create_event :payload => doc |
|
126 |
+ request.on_success do |response| |
|
127 |
+ body = response.body |
|
128 |
+ if (encoding = options['force_encoding']).present? |
|
129 |
+ body = body.encode(Encoding::UTF_8, encoding) |
|
127 | 130 |
end |
128 |
- else |
|
129 |
- output = {} |
|
130 |
- options['extract'].each do |name, extraction_details| |
|
131 |
- if extraction_type == "json" |
|
132 |
- result = Utils.values_at(doc, extraction_details['path']) |
|
133 |
- log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}" |
|
134 |
- else |
|
135 |
- case |
|
136 |
- when css = extraction_details['css'] |
|
137 |
- nodes = doc.css(css) |
|
138 |
- when xpath = extraction_details['xpath'] |
|
139 |
- nodes = doc.xpath(xpath) |
|
131 |
+ doc = parse(body) |
|
132 |
+ |
|
133 |
+ if extract_full_json? |
|
134 |
+ if store_payload!(previous_payloads(1), doc) |
|
135 |
+ log "Storing new result for '#{name}': #{doc.inspect}" |
|
136 |
+ create_event :payload => doc |
|
137 |
+ end |
|
138 |
+ else |
|
139 |
+ output = {} |
|
140 |
+ options['extract'].each do |name, extraction_details| |
|
141 |
+ if extraction_type == "json" |
|
142 |
+ result = Utils.values_at(doc, extraction_details['path']) |
|
143 |
+ log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}" |
|
140 | 144 |
else |
141 |
- error "'css' or 'xpath' is required for HTML or XML extraction" |
|
142 |
- return |
|
143 |
- end |
|
144 |
- unless Nokogiri::XML::NodeSet === nodes |
|
145 |
- error "The result of HTML/XML extraction was not a NodeSet" |
|
146 |
- return |
|
147 |
- end |
|
148 |
- result = nodes.map { |node| |
|
149 |
- if extraction_details['attr'] |
|
150 |
- node.attr(extraction_details['attr']) |
|
151 |
- elsif extraction_details['text'] |
|
152 |
- node.text() |
|
145 |
+ case |
|
146 |
+ when css = extraction_details['css'] |
|
147 |
+ nodes = doc.css(css) |
|
148 |
+ when xpath = extraction_details['xpath'] |
|
149 |
+ nodes = doc.xpath(xpath) |
|
153 | 150 |
else |
154 |
- error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
151 |
+ error "'css' or 'xpath' is required for HTML or XML extraction" |
|
152 |
+ return |
|
153 |
+ end |
|
154 |
+ unless Nokogiri::XML::NodeSet === nodes |
|
155 |
+ error "The result of HTML/XML extraction was not a NodeSet" |
|
155 | 156 |
return |
156 | 157 |
end |
157 |
- } |
|
158 |
- log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
|
158 |
+ result = nodes.map { |node| |
|
159 |
+ if extraction_details['attr'] |
|
160 |
+ node.attr(extraction_details['attr']) |
|
161 |
+ elsif extraction_details['text'] |
|
162 |
+ node.text() |
|
163 |
+ else |
|
164 |
+ error "'attr' or 'text' is required on HTML or XML extraction patterns" |
|
165 |
+ return |
|
166 |
+ end |
|
167 |
+ } |
|
168 |
+ log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
|
169 |
+ end |
|
170 |
+ output[name] = result |
|
159 | 171 |
end |
160 |
- output[name] = result |
|
161 |
- end |
|
162 | 172 |
|
163 |
- num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq |
|
173 |
+ num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq |
|
164 | 174 |
|
165 |
- if num_unique_lengths.length != 1 |
|
166 |
- error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}" |
|
167 |
- return |
|
168 |
- end |
|
169 |
- |
|
170 |
- old_events = previous_payloads num_unique_lengths.first |
|
171 |
- num_unique_lengths.first.times do |index| |
|
172 |
- result = {} |
|
173 |
- options['extract'].keys.each do |name| |
|
174 |
- result[name] = output[name][index] |
|
175 |
- if name.to_s == 'url' |
|
176 |
- result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil? |
|
177 |
- end |
|
175 |
+ if num_unique_lengths.length != 1 |
|
176 |
+ error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}" |
|
177 |
+ return |
|
178 | 178 |
end |
179 |
+ |
|
180 |
+ old_events = previous_payloads num_unique_lengths.first |
|
181 |
+ num_unique_lengths.first.times do |index| |
|
182 |
+ result = {} |
|
183 |
+ options['extract'].keys.each do |name| |
|
184 |
+ result[name] = output[name][index] |
|
185 |
+ if name.to_s == 'url' |
|
186 |
+ result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil? |
|
187 |
+ end |
|
188 |
+ end |
|
179 | 189 |
|
180 |
- if store_payload!(old_events, result) |
|
181 |
- log "Storing new parsed result for '#{name}': #{result.inspect}" |
|
182 |
- create_event :payload => result |
|
190 |
+ if store_payload!(old_events, result) |
|
191 |
+ log "Storing new parsed result for '#{name}': #{result.inspect}" |
|
192 |
+ create_event :payload => result |
|
193 |
+ end |
|
183 | 194 |
end |
184 | 195 |
end |
185 | 196 |
end |
197 |
+ |
|
198 |
+ hydra.queue request |
|
199 |
+ hydra.run |
|
186 | 200 |
end |
187 |
- hydra.queue request |
|
188 |
- hydra.run |
|
189 | 201 |
end |
190 | 202 |
|
191 | 203 |
private |
@@ -91,6 +91,30 @@ describe Agents::WebsiteAgent do |
||
91 | 91 |
@checker.check |
92 | 92 |
@checker.logs.first.message.should =~ /Got an uneven number of matches/ |
93 | 93 |
end |
94 |
+ |
|
95 |
# Saving the agent and running a check must both succeed (raise nothing)
# when 'url' holds a list of URLs instead of a single string.
it "should accept an array for url" do
  @site['url'] = %w[http://xkcd.com/1/ http://xkcd.com/2/]
  @checker.options = @site

  saving = lambda { @checker.save! }
  checking = lambda { @checker.check }

  saving.should_not raise_error
  checking.should_not raise_error
end
|
101 |
+ |
|
102 |
# With 'mode' set to 'all', listing the same URL twice is expected to
# raise Event.count by two after a single check.
it "should parse events from all urls in array" do
  duplicated_urls = Array.new(2) { "http://xkcd.com/" }

  check_in_all_mode = lambda do
    @site['url'] = duplicated_urls
    @site['mode'] = 'all'
    @checker.options = @site
    @checker.check
  end

  check_in_all_mode.should change { Event.count }.by(2)
end
|
110 |
+ |
|
111 |
# Without overriding 'mode' here, listing the same URL twice is expected
# to raise Event.count by only one after a single check.
it "should follow unique rules when parsing array of urls" do
  duplicated_urls = Array.new(2) { "http://xkcd.com/" }

  check_duplicates = lambda do
    @site['url'] = duplicated_urls
    @checker.options = @site
    @checker.check
  end

  check_duplicates.should change { Event.count }.by(1)
end
|
94 | 118 |
end |
95 | 119 |
|
96 | 120 |
describe 'encoding' do |